#eliminating redundant columns
#The calculated column is spelled "Per_aware"; the earlier spelling "Per_Aware" failed with
#"Strings must match column names. Unknown columns: Per_Aware"
MasterData<- select(MasterData, "ID..","Sex","Linguistic.Background","Final.Grade","Incorrect...unaware","Per_correct","Per_aware","Time.Spent..HW.","Time.Spent..Pronunciation.Practice.","Time.Spent..LearnSmart.","Total.HW...Correct","Total.LS...Complete")

#Layers shared by every scatter plot below: the raw points plus stat_sum(),
#which counts the observations at each location
scatter_layers <- list(geom_point(), stat_sum(aes(group = 1)))

#Each variable pair is plotted once (two pairs were previously plotted twice)

#Time spent on Connect homework, Percentage of correct homework responses
ggplot(MasterData, aes(Time_hw, Per_correct_hw)) + scatter_layers

#Time spent on Connect homework, Time spent on LearnSmart adaptive activities
ggplot(MasterData, aes(Time_hw, Time_LS)) + scatter_layers

#Percentage of correct homework responses, Final course grade
ggplot(MasterData, aes(Per_correct_hw, Final_grade)) + scatter_layers

#Percentage of correct LearnSmart responses, Percentage of awareness of correct/incorrect responses
ggplot(MasterData, aes(Per_correct, Per_aware)) + scatter_layers

#Percentage of completion of assigned LearnSmart activities, Final course grade
ggplot(MasterData, aes(Per_complete_LS, Final_grade)) + scatter_layers

#Percentage of completion of assigned LearnSmart activities, Percentage of incorrect and unaware LearnSmart responses
ggplot(MasterData, aes(Per_complete_LS, IU)) + scatter_layers

#Time spent on LearnSmart adaptive activities, Final course grade
ggplot(MasterData, aes(Time_LS, Final_grade)) + scatter_layers

#Percentage of incorrect and unaware LearnSmart responses, Final course grade
ggplot(MasterData, aes(IU, Final_grade)) + scatter_layers

#Percentage of incorrect and unaware LearnSmart responses, Time spent on LearnSmart adaptive activities
ggplot(MasterData, aes(IU, Time_LS)) + scatter_layers

#Percentage of incorrect and unaware LearnSmart responses, Time spent on Connect homework
ggplot(MasterData, aes(IU, Time_hw)) + scatter_layers

#Validating scaling: standard deviation of each scaled column (columns 4 to 12)
vapply(ScaledData[,4:12], sd, numeric(1))
#Console output from a previous run, kept as a comment so the code still parses:
#    Final_grade              IU     Per_correct       Per_aware         Time_hw       Time_pron         Time_LS 
#              1               1               1               1               1               1               1 
# Per_correct_hw Per_complete_LS 
#              1               1 
#Creating the Within Group Sum of Squares function
#Plots the total within-group sum of squares against the number of clusters (1..nc).
#  data: numeric data frame or matrix of the variables to cluster
#  nc:   largest number of clusters to try (must be at least 1)
#  seed: value passed to set.seed() before every kmeans() fit
#Returns (invisibly) the numeric vector of the nc sums of squares that were plotted.
wssplot <- function(data, nc=15, seed=1234){
  stopifnot(is.numeric(nc), length(nc) == 1, nc >= 1)
  #Preallocate instead of growing the vector inside the loop
  wss <- numeric(nc)
  #One cluster: total sum of squares about the column means
  wss[1] <- (nrow(data)-1)*sum(apply(data,2,var))
  #seq_len(nc)[-1] is empty when nc is 1, whereas 2:nc would count down (2, 1)
  for (i in seq_len(nc)[-1]){
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers=i)$withinss)
  }
  plot(seq_len(nc), wss, type="b", xlab="Number of Clusters",
       ylab="Within groups sum of squares")
  invisible(wss)
}
#Draw the WSS graph for 1 to 15 clusters using columns 4 to 12 of the scaled data
wssplot(data = ScaledData[, 4:12], nc = 15, seed = 1234)

#Create and examine several different possible cluster solutions
#Fixed seed so the random starts (and therefore the cluster numbering) are repeatable
set.seed(1234)
threeclusterkmeans<-kmeans(ScaledData[,4:12], 3, nstart=10)
threeclusterkmeans
fourclusterkmeans<-kmeans(ScaledData[,4:12], 4, nstart=10)
fourclusterkmeans
#The five-cluster solution was never fitted, which made the data.frame() call below
#fail with "object 'fiveclusterkmeans' not found"
fiveclusterkmeans<-kmeans(ScaledData[,4:12], 5, nstart=10)
fiveclusterkmeans
#Assign the clusters for each observation for k=3,4,5 to a new dataframe
Clusters<-data.frame(MasterData, threeclusterkmeans$cluster, fourclusterkmeans$cluster, fiveclusterkmeans$cluster)
#Graph the different solutions - Three Clusters.
#Points are coloured by three-cluster membership; shapes indicate Language Background.
#To change which variable is marked by shape, change the name of the variable in "shape = Language"
three_cluster_points <- geom_point(aes(
  group = factor(threeclusterkmeans.cluster),
  shape = Language,
  color = factor(threeclusterkmeans.cluster)
))

ggplot(Clusters, aes(Time_hw, Per_correct_hw)) + three_cluster_points

ggplot(Clusters, aes(Time_hw, Time_LS)) + three_cluster_points

ggplot(Clusters, aes(Per_correct_hw, Final_grade)) + three_cluster_points

ggplot(Clusters, aes(Per_correct, Per_aware)) + three_cluster_points

ggplot(Clusters, aes(Per_complete_LS, Final_grade)) + three_cluster_points

ggplot(Clusters, aes(Per_complete_LS, IU)) + three_cluster_points

ggplot(Clusters, aes(Time_LS, Final_grade)) + three_cluster_points

ggplot(Clusters, aes(IU, Final_grade)) + three_cluster_points

ggplot(Clusters, aes(IU, Time_LS)) + three_cluster_points

ggplot(Clusters, aes(IU, Time_hw)) + three_cluster_points

#Graph the different solutions - Four Clusters.
#Points are coloured by four-cluster membership; shapes indicate Sex.
#To change which variable is marked by shape, change the name of the variable in "shape = Sex"
#size = 2 is set outside aes(): inside aes() it is mapped as data, which adds a
#meaningless "size" legend and does not draw the points at size 2
four_cluster_points <- geom_point(
  aes(
    group = factor(fourclusterkmeans.cluster),
    shape = Sex,
    color = factor(fourclusterkmeans.cluster)
  ),
  size = 2
)

ggplot(Clusters, aes(Time_hw, Per_correct_hw)) + four_cluster_points

ggplot(Clusters, aes(Time_hw, Time_LS)) + four_cluster_points

ggplot(Clusters, aes(Per_correct_hw, Final_grade)) + four_cluster_points

ggplot(Clusters, aes(Per_correct, Per_aware)) + four_cluster_points

ggplot(Clusters, aes(Per_complete_LS, Final_grade)) + four_cluster_points

ggplot(Clusters, aes(Per_complete_LS, IU)) + four_cluster_points

ggplot(Clusters, aes(Time_LS, Final_grade)) + four_cluster_points

ggplot(Clusters, aes(IU, Final_grade)) + four_cluster_points

ggplot(Clusters, aes(IU, Time_LS)) + four_cluster_points

ggplot(Clusters, aes(IU, Time_hw)) + four_cluster_points

#Graph the different solutions - Five Clusters.
#Points are coloured by five-cluster membership; shapes indicate Language Background.
#To change which variable is marked by shape, change the name of the variable in "shape = Language"
five_cluster_points <- geom_point(aes(
  group = factor(fiveclusterkmeans.cluster),
  shape = Language,
  color = factor(fiveclusterkmeans.cluster)
))
ggplot(Clusters, aes(Time_hw, Per_correct_hw)) + five_cluster_points
ggplot(Clusters, aes(Time_hw, Time_LS)) + five_cluster_points
ggplot(Clusters, aes(Per_correct_hw, Final_grade)) + five_cluster_points
ggplot(Clusters, aes(Per_correct, Per_aware)) + five_cluster_points
ggplot(Clusters, aes(Per_complete_LS, Final_grade)) + five_cluster_points
ggplot(Clusters, aes(Per_complete_LS, IU)) + five_cluster_points
ggplot(Clusters, aes(Time_LS, Final_grade)) + five_cluster_points
ggplot(Clusters, aes(IU, Final_grade)) + five_cluster_points
ggplot(Clusters, aes(IU, Time_LS)) + five_cluster_points
ggplot(Clusters, aes(IU, Time_hw)) + five_cluster_points
---
title: "R Notebook"
output: html_notebook
---
  
```{r}
#load relevant libraries
library(dplyr)
library(ggplot2)

#Import data into a new data frame
MasterData <- read.csv("~/downloads/FRE1120 Data Summary - RAW.csv")
#Keep only students who were required to do the LS exercises and whose linguistic
#background is English, Spanish, or Creole (the other backgrounds have very low numbers)
MasterData <- subset(
  MasterData,
  LS.Required == "y" & Linguistic.Background %in% c("English", "Creole", "Spanish")
)
#Creating the calculated fields of Per_aware and Per_correct
MasterData <- mutate(
  MasterData,
  Per_correct = Correct...aware + Correct...unaware,
  Per_aware = Correct...aware + Incorrect...aware
)
#Eliminating redundant columns
MasterData <- select(
  MasterData,
  "ID..", "Sex", "Linguistic.Background", "Final.Grade",
  "Incorrect...unaware", "Per_correct", "Per_aware",
  "Time.Spent..HW.", "Time.Spent..Pronunciation.Practice.", "Time.Spent..LearnSmart.",
  "Total.HW...Correct", "Total.LS...Complete"
)
#Changing column names (same order as the columns selected above)
MasterData <- setNames(
  MasterData,
  c(
    "ID", "Sex", "Language", "Final_grade",
    "IU", "Per_correct", "Per_aware",
    "Time_hw", "Time_pron", "Time_LS",
    "Per_correct_hw", "Per_complete_LS"
  )
)

```

```{r}
#Examine scatter plots for different variable combination pairs
#Creating scatter plots for all pairs
#pairs() stops with "non-numeric argument" on character columns (the read.csv default
#since R 4.0); data.matrix() turns character/factor columns into integer codes first
pairs(data.matrix(MasterData))
#Layers shared by every scatter plot below: the raw points plus stat_sum(),
#which counts the observations at each location
scatter_layers <- list(geom_point(), stat_sum(aes(group = 1)))
#Time spent on Connect homework, Percentage of correct homework responses
ggplot(MasterData, aes(Time_hw, Per_correct_hw)) + scatter_layers
#Time spent on Connect homework, Time spent on LearnSmart adaptive activities
ggplot(MasterData, aes(Time_hw, Time_LS)) + scatter_layers
#Percentage of correct homework responses, Final course grade
ggplot(MasterData, aes(Per_correct_hw, Final_grade)) + scatter_layers
#Percentage of correct LearnSmart responses, Percentage of awareness of correct/incorrect responses
ggplot(MasterData, aes(Per_correct, Per_aware)) + scatter_layers
#Percentage of completion of assigned LearnSmart activities, Final course grade
ggplot(MasterData, aes(Per_complete_LS, Final_grade)) + scatter_layers
#Percentage of completion of assigned LearnSmart activities, Percentage of incorrect and unaware LearnSmart responses
ggplot(MasterData, aes(Per_complete_LS, IU)) + scatter_layers
#Time spent on LearnSmart adaptive activities, Final course grade
ggplot(MasterData, aes(Time_LS, Final_grade)) + scatter_layers
#Percentage of incorrect and unaware LearnSmart responses, Final course grade
ggplot(MasterData, aes(IU, Final_grade)) + scatter_layers
#Percentage of incorrect and unaware LearnSmart responses, Time spent on LearnSmart adaptive activities
ggplot(MasterData, aes(IU, Time_LS)) + scatter_layers
#Percentage of incorrect and unaware LearnSmart responses, Time spent on Connect homework
ggplot(MasterData, aes(IU, Time_hw)) + scatter_layers
```

```{r}
#Build the scaled data frame: the first three columns of MasterData are kept as they are,
#followed by the scaled versions of Final_grade through Per_complete_LS
clustering_columns <- select(MasterData, Final_grade:Per_complete_LS)
ScaledData <- cbind(MasterData[, 1:3], scale(clustering_columns))
#Validate the scaling by looking at the mean and standard deviation of each scaled column
vapply(ScaledData[, 4:12], mean, numeric(1))
vapply(ScaledData[, 4:12], sd, numeric(1))
```

```{r}
#Creating the Within Group Sum of Squares function
#Plots the total within-group sum of squares against the number of clusters (1..nc).
#  data: numeric data frame or matrix of the variables to cluster
#  nc:   largest number of clusters to try (must be at least 1)
#  seed: value passed to set.seed() before every kmeans() fit
#Returns (invisibly) the numeric vector of the nc sums of squares that were plotted.
wssplot <- function(data, nc=15, seed=1234){
  stopifnot(is.numeric(nc), length(nc) == 1, nc >= 1)
  #Preallocate instead of growing the vector inside the loop
  wss <- numeric(nc)
  #One cluster: total sum of squares about the column means
  wss[1] <- (nrow(data)-1)*sum(apply(data,2,var))
  #seq_len(nc)[-1] is empty when nc is 1, whereas 2:nc would count down (2, 1)
  for (i in seq_len(nc)[-1]){
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers=i)$withinss)
  }
  plot(seq_len(nc), wss, type="b", xlab="Number of Clusters",
       ylab="Within groups sum of squares")
  invisible(wss)
}

#Draw the WSS graph for 1 to 15 clusters using columns 4 to 12 of the scaled data
wssplot(data = ScaledData[, 4:12], nc = 15, seed = 1234)
```
```{r}
#Create and examine several different possible cluster solutions
#Fixed seed so the random starts (and therefore the cluster numbering used for the
#plot colours) are repeatable when this chunk is re-run on its own
set.seed(1234)
threeclusterkmeans<-kmeans(ScaledData[,4:12], 3, nstart=10)
threeclusterkmeans
fourclusterkmeans<-kmeans(ScaledData[,4:12], 4, nstart=10)
fourclusterkmeans
```
```{r}
#Attach the k=3 and k=4 cluster assignment of each observation to the scaled data in a new data frame
Clusters <- data.frame(
  ScaledData,
  threeclusterkmeans.cluster = threeclusterkmeans$cluster,
  fourclusterkmeans.cluster = fourclusterkmeans$cluster
)
```

```{r}
#Graph the different solutions - Three Clusters.
#Points are coloured by three-cluster membership; shapes indicate Language Background.
#To change which variable is marked by shape, change the name of the variable in "shape = Language"
three_cluster_points <- geom_point(aes(
  group = factor(threeclusterkmeans.cluster),
  shape = Language,
  color = factor(threeclusterkmeans.cluster)
))
ggplot(Clusters, aes(Time_hw, Per_correct_hw)) + three_cluster_points
ggplot(Clusters, aes(Time_hw, Time_LS)) + three_cluster_points
ggplot(Clusters, aes(Per_correct_hw, Final_grade)) + three_cluster_points
ggplot(Clusters, aes(Per_correct, Per_aware)) + three_cluster_points
ggplot(Clusters, aes(Per_complete_LS, Final_grade)) + three_cluster_points
ggplot(Clusters, aes(Per_complete_LS, IU)) + three_cluster_points
ggplot(Clusters, aes(Time_LS, Final_grade)) + three_cluster_points
ggplot(Clusters, aes(IU, Final_grade)) + three_cluster_points
ggplot(Clusters, aes(IU, Time_LS)) + three_cluster_points
ggplot(Clusters, aes(IU, Time_hw)) + three_cluster_points
```

```{r}
#Graph the different solutions - Four Clusters.
#Points are coloured by four-cluster membership; shapes indicate Language Background.
#To change which variable is marked by shape, change the name of the variable in "shape = Language"
#size = 2 is set outside aes(): inside aes() it is mapped as data, which adds a
#meaningless "size" legend and does not draw the points at size 2
four_cluster_points <- geom_point(
  aes(
    group = factor(fourclusterkmeans.cluster),
    shape = Language,
    color = factor(fourclusterkmeans.cluster)
  ),
  size = 2
)
ggplot(Clusters, aes(Time_hw, Per_correct_hw)) + four_cluster_points
ggplot(Clusters, aes(Time_hw, Time_LS)) + four_cluster_points
ggplot(Clusters, aes(Per_correct_hw, Final_grade)) + four_cluster_points
ggplot(Clusters, aes(Per_correct, Per_aware)) + four_cluster_points
ggplot(Clusters, aes(Per_complete_LS, Final_grade)) + four_cluster_points
ggplot(Clusters, aes(Per_complete_LS, IU)) + four_cluster_points
ggplot(Clusters, aes(Time_LS, Final_grade)) + four_cluster_points
ggplot(Clusters, aes(IU, Final_grade)) + four_cluster_points
ggplot(Clusters, aes(IU, Time_LS)) + four_cluster_points
ggplot(Clusters, aes(IU, Time_hw)) + four_cluster_points
```
```{r}
#Graph the different solutions - Five Clusters.
#The earlier chunks fit and attach only the k=3 and k=4 solutions, so the five-cluster
#solution is fitted and added to Clusters here; without it every plot below fails
#because the column fiveclusterkmeans.cluster does not exist
set.seed(1234)
fiveclusterkmeans<-kmeans(ScaledData[,4:12], 5, nstart=10)
Clusters$fiveclusterkmeans.cluster<-fiveclusterkmeans$cluster
#Points are coloured by five-cluster membership; shapes indicate Language Background.
#To change which variable is marked by shape, change the name of the variable in "shape = Language"
five_cluster_points <- geom_point(aes(
  group = factor(fiveclusterkmeans.cluster),
  shape = Language,
  color = factor(fiveclusterkmeans.cluster)
))
ggplot(Clusters, aes(Time_hw, Per_correct_hw)) + five_cluster_points
ggplot(Clusters, aes(Time_hw, Time_LS)) + five_cluster_points
ggplot(Clusters, aes(Per_correct_hw, Final_grade)) + five_cluster_points
ggplot(Clusters, aes(Per_correct, Per_aware)) + five_cluster_points
ggplot(Clusters, aes(Per_complete_LS, Final_grade)) + five_cluster_points
ggplot(Clusters, aes(Per_complete_LS, IU)) + five_cluster_points
ggplot(Clusters, aes(Time_LS, Final_grade)) + five_cluster_points
ggplot(Clusters, aes(IU, Final_grade)) + five_cluster_points
ggplot(Clusters, aes(IU, Time_LS)) + five_cluster_points
ggplot(Clusters, aes(IU, Time_hw)) + five_cluster_points